In [2]:
import pandas as pd
import seaborn as sns
housing = pd.read_csv('Files/house.csv')
housing
Out[2]:
| Avg. Area Income | Avg. Area House Age | Avg. Area Number of Rooms | Avg. Area Number of Bedrooms | Area Population | Price | |
|---|---|---|---|---|---|---|
| 0 | 79545.45857 | 5.682861 | 7.009188 | 4.09 | 23086.80050 | 1.059034e+06 |
| 1 | 79248.64245 | 6.002900 | 6.730821 | 3.09 | 40173.07217 | 1.505891e+06 |
| 2 | 61287.06718 | 5.865890 | 8.512727 | 5.13 | 36882.15940 | 1.058988e+06 |
| 3 | 63345.24005 | 7.188236 | 5.586729 | 3.26 | 34310.24283 | 1.260617e+06 |
| 4 | 59982.19723 | 5.040555 | 7.839388 | 4.23 | 26354.10947 | 6.309435e+05 |
| ... | ... | ... | ... | ... | ... | ... |
| 4995 | 60567.94414 | 7.830362 | 6.137356 | 3.46 | 22837.36103 | 1.060194e+06 |
| 4996 | 78491.27543 | 6.999135 | 6.576763 | 4.02 | 25616.11549 | 1.482618e+06 |
| 4997 | 63390.68689 | 7.250591 | 4.805081 | 2.13 | 33266.14549 | 1.030730e+06 |
| 4998 | 68001.33124 | 5.534388 | 7.130144 | 5.44 | 42625.62016 | 1.198657e+06 |
| 4999 | 65510.58180 | 5.992305 | 6.792336 | 4.07 | 46501.28380 | 1.298950e+06 |
5000 rows × 6 columns
In [4]:
# Check null
housing.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Avg. Area Income 5000 non-null float64 1 Avg. Area House Age 5000 non-null float64 2 Avg. Area Number of Rooms 5000 non-null float64 3 Avg. Area Number of Bedrooms 5000 non-null float64 4 Area Population 5000 non-null float64 5 Price 5000 non-null float64 dtypes: float64(6) memory usage: 234.5 KB
In [5]:
# plot price
sns.distplot(housing["Price"])
C:\Users\ohm\AppData\Local\Temp\ipykernel_10268\3557035226.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(housing["Price"])
Out[5]:
<Axes: xlabel='Price', ylabel='Density'>
In [28]:
housing.columns
Out[28]:
Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
'Avg. Area Number of Bedrooms', 'Area Population', 'Price'],
dtype='object')
In [6]:
sns.heatmap(housing.corr(), annot=True)
Out[6]:
<Axes: >
A.1.2 Split data set to training data and testing data¶
In [7]:
from sklearn.model_selection import train_test_split
housing.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Avg. Area Income 5000 non-null float64 1 Avg. Area House Age 5000 non-null float64 2 Avg. Area Number of Rooms 5000 non-null float64 3 Avg. Area Number of Bedrooms 5000 non-null float64 4 Area Population 5000 non-null float64 5 Price 5000 non-null float64 dtypes: float64(6) memory usage: 234.5 KB
In [8]:
# use only 0-4 cause that is independent variable , 5 is variale dependent
x=housing.iloc[:,0:4]
x.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Avg. Area Income 5000 non-null float64 1 Avg. Area House Age 5000 non-null float64 2 Avg. Area Number of Rooms 5000 non-null float64 3 Avg. Area Number of Bedrooms 5000 non-null float64 dtypes: float64(4) memory usage: 156.4 KB
In [9]:
y=housing['Price']
y
Out[9]:
0 1.059034e+06
1 1.505891e+06
2 1.058988e+06
3 1.260617e+06
4 6.309435e+05
...
4995 1.060194e+06
4996 1.482618e+06
4997 1.030730e+06
4998 1.198657e+06
4999 1.298950e+06
Name: Price, Length: 5000, dtype: float64
In [10]:
# test 40% there rest 60% keep to train
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4)
A.1.3 Build the linear regressing model¶
In [11]:
from sklearn.linear_model import LinearRegression
# create model object
lm = LinearRegression()
#use fit fuction , add X_train,y_train in model that let result to out put
lm.fit(X_train,y_train)
Out[11]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [12]:
# check completed model (y-intercept point incase parameter is 0)
lm.intercept_
Out[12]:
np.float64(-1977586.8477489694)
In [13]:
# slope value has 4 values cause independent is 4
coeff_df = pd.DataFrame(lm.coef_,x.columns,columns=['Coefficient'])
lm.coef_
Out[13]:
array([ 2.11245355e+01, 1.60035916e+05, 1.17002857e+05, -2.80666888e+03])
In [14]:
coeff_df
Out[14]:
| Coefficient | |
|---|---|
| Avg. Area Income | 21.124536 |
| Avg. Area House Age | 160035.915630 |
| Avg. Area Number of Rooms | 117002.856909 |
| Avg. Area Number of Bedrooms | -2806.668878 |
A.1.4 Compare between original data and the result of prediction from the model¶
In [15]:
# y >>> predictions
y_predictions = lm.predict(X_test)
y_predictions
Out[15]:
array([1646811.75726285, 1291341.97500786, 1314162.0760482 , ...,
979598.80850084, 979285.38547021, 1254305.49153467])
In [16]:
# y_test
y_test
Out[16]:
2638 1.735637e+06
2541 1.499243e+06
1911 1.237116e+06
4129 2.187326e+06
807 7.048842e+05
...
2214 1.570905e+06
3858 1.491145e+06
2030 1.055153e+06
1115 8.749697e+05
2390 1.409319e+06
Name: Price, Length: 2000, dtype: float64
In [17]:
# check y_test and y_predictions by plot
import matplotlib.pyplot as plt
plt.scatter(y_test,y_predictions)
Out[17]:
<matplotlib.collections.PathCollection at 0x27c873c1bd0>
In [18]:
# check y_test and y_predictions by histograms
sns.distplot(y_test-y_predictions)
C:\Users\ohm\AppData\Local\Temp\ipykernel_10268\1757013246.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y_test-y_predictions)
Out[18]:
<Axes: xlabel='Price', ylabel='Density'>
In [19]:
y_test == y_predictions
Out[19]:
2638 False
2541 False
1911 False
4129 False
807 False
...
2214 False
3858 False
2030 False
1115 False
2390 False
Name: Price, Length: 2000, dtype: bool
In [20]:
sns.distplot((y_test-y_predictions),kde=False,bins=10);
C:\Users\ohm\AppData\Local\Temp\ipykernel_10268\3678181030.py:1: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot((y_test-y_predictions),kde=False,bins=10);
A.1.5 Calculate the error of the model¶
In [21]:
# y_test and y_predictions difference about 140000
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_predictions))
MAE: 144574.49079237555
In [22]:
# a lot values cause **2
print('MSE:', metrics.mean_squared_error(y_test, y_predictions))
MSE: 33263762658.068634
In [23]:
# use sqrt about 180000
import numpy as np
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_predictions)))
RMSE: 182383.55917699554
In [24]:
import pandas as pd
df = pd.read_excel('AA.xlsx')
df.head()
Out[24]:
| Class | Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | WineA | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
| 1 | WineA | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
| 2 | WineA | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
| 3 | WineA | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
| 4 | WineA | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
In [25]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 131 entries, 0 to 130 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Class 131 non-null object 1 Alcohol 131 non-null float64 2 Malic acid 131 non-null float64 3 Ash 131 non-null float64 4 Alcalinity of ash 131 non-null float64 5 Magnesium 131 non-null int64 6 Total phenols 131 non-null float64 7 Flavanoids 131 non-null float64 8 Nonflavanoid phenols 131 non-null float64 9 Proanthocyanins 131 non-null float64 10 Color intensity 131 non-null float64 11 Hue 131 non-null float64 12 OD280/OD315 of diluted wines 131 non-null float64 13 Proline 131 non-null int64 dtypes: float64(11), int64(2), object(1) memory usage: 14.5+ KB
In [26]:
# drop column
x = df.drop('Class',axis=1) # คลาสจะหายไป
y = df['Class']
In [27]:
x
Out[27]:
| Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
| 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
| 2 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
| 3 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
| 4 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 126 | 12.07 | 2.16 | 2.17 | 21.0 | 85 | 2.60 | 2.65 | 0.37 | 1.35 | 2.76 | 0.86 | 3.28 | 378 |
| 127 | 12.43 | 1.53 | 2.29 | 21.5 | 86 | 2.74 | 3.15 | 0.39 | 1.77 | 3.94 | 0.69 | 2.84 | 352 |
| 128 | 11.79 | 2.13 | 2.78 | 28.5 | 92 | 2.13 | 2.24 | 0.58 | 1.76 | 3.00 | 0.97 | 2.44 | 466 |
| 129 | 12.37 | 1.63 | 2.30 | 24.5 | 88 | 2.22 | 2.45 | 0.40 | 1.90 | 2.12 | 0.89 | 2.78 | 342 |
| 130 | 12.04 | 4.30 | 2.38 | 22.0 | 80 | 2.10 | 1.75 | 0.42 | 1.35 | 2.60 | 0.79 | 2.57 | 580 |
131 rows × 13 columns
In [28]:
y
Out[28]:
0 WineA
1 WineA
2 WineA
3 WineA
4 WineA
...
126 WineB
127 WineB
128 WineB
129 WineB
130 WineB
Name: Class, Length: 131, dtype: object
In [29]:
y.describe()
Out[29]:
count 131 unique 2 top WineB freq 71 Name: Class, dtype: object
In [30]:
import numpy as np
np.unique(y)
Out[30]:
array(['WineA', 'WineB'], dtype=object)
A.2.2 Adjust data scaling¶
In [31]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x = scaler.fit_transform(x)
pd.DataFrame(x).describe()
Out[31]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 |
| mean | 3.634074e-15 | -3.491694e-16 | -1.084798e-15 | 1.627197e-16 | 3.254394e-16 | -4.067993e-16 | 3.254394e-16 | 1.627197e-16 | -1.016998e-16 | 2.711995e-16 | -1.694997e-15 | -5.423990e-17 | -1.084798e-16 |
| std | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 |
| min | -2.169454e+00 | -1.398315e+00 | -3.324307e+00 | -2.423456e+00 | -1.954514e+00 | -2.580426e+00 | -2.581719e+00 | -1.832669e+00 | -2.474022e+00 | -1.806214e+00 | -2.164089e+00 | -3.348390e+00 | -1.459258e+00 |
| 25% | -8.032242e-01 | -5.239100e-01 | -5.775213e-01 | -6.605415e-01 | -7.844851e-01 | -7.202807e-01 | -6.898889e-01 | -6.373281e-01 | -6.039474e-01 | -8.175845e-01 | -7.178632e-01 | -4.392664e-01 | -8.936873e-01 |
| 50% | 5.279530e-02 | -2.667321e-01 | -6.885731e-02 | -8.277973e-02 | -1.344690e-01 | 8.213483e-02 | 9.386945e-02 | -2.695308e-01 | -8.551092e-02 | -1.822585e-01 | -3.902263e-02 | 3.714072e-02 | -2.138605e-01 |
| 75% | 8.861388e-01 | 1.047471e-01 | 6.263168e-01 | 6.283117e-01 | 5.155471e-01 | 7.477750e-01 | 6.952013e-01 | 6.039877e-01 | 4.699567e-01 | 7.164954e-01 | 6.398180e-01 | 7.264105e-01 | 7.816003e-01 |
| max | 2.138989e+00 | 4.385331e+00 | 3.017037e+00 | 3.324534e+00 | 4.025634e+00 | 2.489381e+00 | 3.512677e+00 | 3.040645e+00 | 3.395420e+00 | 2.916893e+00 | 3.856932e+00 | 2.145495e+00 | 2.545437e+00 |
A.2.3 Split data set to training data and testing data¶
In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
A.2.4. Build the linear regressing model¶
In [33]:
# create model logmodel
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
In [34]:
# train model
logmodel.fit(X_train,y_train)
Out[34]:
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [35]:
# must scaling cause scale is difference (std is swing)
df.describe()
Out[35]:
| Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 131.000000 | 131.000000 | 131.000000 | 131.000000 | 131.000000 | 131.000000 | 131.000000 | 131.000000 | 131.000000 | 131.000000 | 131.000000 | 131.000000 | 131.000000 |
| mean | 12.943435 | 1.963359 | 2.340305 | 18.779389 | 100.068702 | 2.514962 | 2.480534 | 0.329313 | 1.746183 | 4.194046 | 1.056611 | 2.941679 | 788.870229 |
| std | 0.885375 | 0.878239 | 0.296022 | 3.388050 | 15.443291 | 0.550449 | 0.742865 | 0.109173 | 0.542159 | 1.619538 | 0.170057 | 0.495169 | 351.433053 |
| min | 11.030000 | 0.740000 | 1.360000 | 10.600000 | 70.000000 | 1.100000 | 0.570000 | 0.130000 | 0.410000 | 1.280000 | 0.690000 | 1.290000 | 278.000000 |
| 25% | 12.235000 | 1.505000 | 2.170000 | 16.550000 | 88.000000 | 2.120000 | 1.970000 | 0.260000 | 1.420000 | 2.875000 | 0.935000 | 2.725000 | 476.000000 |
| 50% | 12.990000 | 1.730000 | 2.320000 | 18.500000 | 98.000000 | 2.560000 | 2.550000 | 0.300000 | 1.700000 | 3.900000 | 1.050000 | 2.960000 | 714.000000 |
| 75% | 13.725000 | 2.055000 | 2.525000 | 20.900000 | 108.000000 | 2.925000 | 2.995000 | 0.395000 | 2.000000 | 5.350000 | 1.165000 | 3.300000 | 1062.500000 |
| max | 14.830000 | 5.800000 | 3.230000 | 30.000000 | 162.000000 | 3.880000 | 5.080000 | 0.660000 | 3.580000 | 8.900000 | 1.710000 | 4.000000 | 1680.000000 |
In [36]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x = scaler.fit_transform(x)
x
Out[36]:
array([[ 1.45870817, -0.28959238, 0.30416292, ..., -0.09805225,
1.98331431, 0.78874133],
[ 0.29089345, -0.20958147, -0.67925406, ..., -0.03902263,
0.92913695, 0.7458951 ],
[ 0.24554142, 0.45336608, 1.11802525, ..., -0.15708186,
0.46286619, 1.13151118],
...,
[-1.30776554, 0.19047308, 1.49104549, ..., -0.51125956,
-1.01703665, -0.9222515 ],
[-0.65016113, -0.38103342, -0.13667917, ..., -0.98349649,
-0.32776683, -1.27644701],
[-1.02431536, 2.67081133, 0.13460827, ..., -1.57379265,
-0.75349231, -0.59662014]])
In [37]:
# make to Frame
xx =pd.DataFrame(x)
In [38]:
# can see std near 1
xx.describe()
Out[38]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.310000e+02 | 1.310000e+02 | 131.000000 | 1.310000e+02 | 131.000000 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 | 1.310000e+02 |
| mean | 5.423990e-17 | -1.016998e-17 | 0.000000 | 1.084798e-16 | 0.000000 | 5.423990e-17 | -2.711995e-17 | 2.711995e-17 | -2.033996e-17 | 1.084798e-16 | 1.694997e-17 | -5.423990e-17 | -5.423990e-17 |
| std | 1.003839e+00 | 1.003839e+00 | 1.003839 | 1.003839e+00 | 1.003839 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 | 1.003839e+00 |
| min | -2.169454e+00 | -1.398315e+00 | -3.324307 | -2.423456e+00 | -1.954514 | -2.580426e+00 | -2.581719e+00 | -1.832669e+00 | -2.474022e+00 | -1.806214e+00 | -2.164089e+00 | -3.348390e+00 | -1.459258e+00 |
| 25% | -8.032242e-01 | -5.239100e-01 | -0.577521 | -6.605415e-01 | -0.784485 | -7.202807e-01 | -6.898889e-01 | -6.373281e-01 | -6.039474e-01 | -8.175845e-01 | -7.178632e-01 | -4.392664e-01 | -8.936873e-01 |
| 50% | 5.279530e-02 | -2.667321e-01 | -0.068857 | -8.277973e-02 | -0.134469 | 8.213483e-02 | 9.386945e-02 | -2.695308e-01 | -8.551092e-02 | -1.822585e-01 | -3.902263e-02 | 3.714072e-02 | -2.138605e-01 |
| 75% | 8.861388e-01 | 1.047471e-01 | 0.626317 | 6.283117e-01 | 0.515547 | 7.477750e-01 | 6.952013e-01 | 6.039877e-01 | 4.699567e-01 | 7.164954e-01 | 6.398180e-01 | 7.264105e-01 | 7.816003e-01 |
| max | 2.138989e+00 | 4.385331e+00 | 3.017037 | 3.324534e+00 | 4.025634 | 2.489381e+00 | 3.512677e+00 | 3.040645e+00 | 3.395420e+00 | 2.916893e+00 | 3.856932e+00 | 2.145495e+00 | 2.545437e+00 |
A.2.5 Compare between original data and the result of prediction from the model¶
In [39]:
y_predictions = logmodel.predict(X_test)
y_predictions
len(y_predictions)
y_predictions
Out[39]:
array(['WineB', 'WineA', 'WineB', 'WineA', 'WineB', 'WineB', 'WineB',
'WineB', 'WineA', 'WineB', 'WineB', 'WineB', 'WineA', 'WineA',
'WineB', 'WineA', 'WineB', 'WineA', 'WineB', 'WineA', 'WineA',
'WineB', 'WineB', 'WineB', 'WineB', 'WineA', 'WineA', 'WineA',
'WineB', 'WineA', 'WineB', 'WineA', 'WineA', 'WineB', 'WineB',
'WineB', 'WineA', 'WineB', 'WineB', 'WineA'], dtype=object)
A.2.6 Evaluate the model with confusion metrix¶
In [40]:
# check errer
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_predictions))
[[17 0] [ 0 23]]
In [41]:
print(classification_report(y_test, y_predictions))
precision recall f1-score support
WineA 1.00 1.00 1.00 17
WineB 1.00 1.00 1.00 23
accuracy 1.00 40
macro avg 1.00 1.00 1.00 40
weighted avg 1.00 1.00 1.00 40
that mean¶
- precision is Prediction accuracy (ความแม่นในการทนาย)
- recall ใน A 100 สามารถระบุได้เลยว่าเป็น 97 ตัว ส่วนอีก 3 ระบุไม่ได้ / ส่วน B 100 สามารถระบุได้เเค่ 98
- support ข้อมูลทั้งหมดมีไวน์ A 20 B 20
In [65]:
import pandas as pd
df = pd.read_excel('BB.xlsx')
df.head()
Out[65]:
| Class | Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Type1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
| 1 | Type1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
| 2 | Type1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
| 3 | Type1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
| 4 | Type1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
In [66]:
x=df.drop('Class',axis=1)
y=df['Class']
In [67]:
x
Out[67]:
| Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
| 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
| 2 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
| 3 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
| 4 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 173 | 13.71 | 5.65 | 2.45 | 20.5 | 95 | 1.68 | 0.61 | 0.52 | 1.06 | 7.70 | 0.64 | 1.74 | 740 |
| 174 | 13.40 | 3.91 | 2.48 | 23.0 | 102 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30 | 0.70 | 1.56 | 750 |
| 175 | 13.27 | 4.28 | 2.26 | 20.0 | 120 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835 |
| 176 | 13.17 | 2.59 | 2.37 | 20.0 | 120 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30 | 0.60 | 1.62 | 840 |
| 177 | 14.13 | 4.10 | 2.74 | 24.5 | 96 | 2.05 | 0.76 | 0.56 | 1.35 | 9.20 | 0.61 | 1.60 | 560 |
178 rows × 13 columns
In [68]:
y
Out[68]:
0 Type1
1 Type1
2 Type1
3 Type1
4 Type1
...
173 Tpye3
174 Tpye3
175 Tpye3
176 Tpye3
177 Tpye3
Name: Class, Length: 178, dtype: object
In [69]:
import numpy as np
#check type quantity
np.unique(y)
Out[69]:
array(['Tpye3', 'Type1', 'Type2'], dtype=object)
In [70]:
# see std (scale) is swing
df.describe()
Out[70]:
| Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 |
| mean | 13.000618 | 2.336348 | 2.366517 | 19.494944 | 99.741573 | 2.295112 | 2.029270 | 0.361854 | 1.590899 | 5.058090 | 0.957449 | 2.611685 | 746.893258 |
| std | 0.811827 | 1.117146 | 0.274344 | 3.339564 | 14.282484 | 0.625851 | 0.998859 | 0.124453 | 0.572359 | 2.318286 | 0.228572 | 0.709990 | 314.907474 |
| min | 11.030000 | 0.740000 | 1.360000 | 10.600000 | 70.000000 | 0.980000 | 0.340000 | 0.130000 | 0.410000 | 1.280000 | 0.480000 | 1.270000 | 278.000000 |
| 25% | 12.362500 | 1.602500 | 2.210000 | 17.200000 | 88.000000 | 1.742500 | 1.205000 | 0.270000 | 1.250000 | 3.220000 | 0.782500 | 1.937500 | 500.500000 |
| 50% | 13.050000 | 1.865000 | 2.360000 | 19.500000 | 98.000000 | 2.355000 | 2.135000 | 0.340000 | 1.555000 | 4.690000 | 0.965000 | 2.780000 | 673.500000 |
| 75% | 13.677500 | 3.082500 | 2.557500 | 21.500000 | 107.000000 | 2.800000 | 2.875000 | 0.437500 | 1.950000 | 6.200000 | 1.120000 | 3.170000 | 985.000000 |
| max | 14.830000 | 5.800000 | 3.230000 | 30.000000 | 162.000000 | 3.880000 | 5.080000 | 0.660000 | 3.580000 | 13.000000 | 1.710000 | 4.000000 | 1680.000000 |
A.3.2 Adjust data scaling¶
In [71]:
# std is near 1 , we can use that
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x = scaler.fit_transform(x)
pd.DataFrame(x).describe()
Out[71]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.780000e+02 | 1.780000e+02 | 1.780000e+02 | 1.780000e+02 | 1.780000e+02 | 178.000000 | 1.780000e+02 | 1.780000e+02 | 1.780000e+02 | 1.780000e+02 | 1.780000e+02 | 1.780000e+02 | 1.780000e+02 |
| mean | -8.382808e-16 | -1.197544e-16 | -8.370333e-16 | -3.991813e-17 | -3.991813e-17 | 0.000000 | -3.991813e-16 | 3.592632e-16 | -1.197544e-16 | 2.494883e-17 | 1.995907e-16 | 3.193450e-16 | -1.596725e-16 |
| std | 1.002821e+00 | 1.002821e+00 | 1.002821e+00 | 1.002821e+00 | 1.002821e+00 | 1.002821 | 1.002821e+00 | 1.002821e+00 | 1.002821e+00 | 1.002821e+00 | 1.002821e+00 | 1.002821e+00 | 1.002821e+00 |
| min | -2.434235e+00 | -1.432983e+00 | -3.679162e+00 | -2.671018e+00 | -2.088255e+00 | -2.107246 | -1.695971e+00 | -1.868234e+00 | -2.069034e+00 | -1.634288e+00 | -2.094732e+00 | -1.895054e+00 | -1.493188e+00 |
| 25% | -7.882448e-01 | -6.587486e-01 | -5.721225e-01 | -6.891372e-01 | -8.244151e-01 | -0.885468 | -8.275393e-01 | -7.401412e-01 | -5.972835e-01 | -7.951025e-01 | -7.675624e-01 | -9.522483e-01 | -7.846378e-01 |
| 50% | 6.099988e-02 | -4.231120e-01 | -2.382132e-02 | 1.518295e-03 | -1.222817e-01 | 0.095960 | 1.061497e-01 | -1.760948e-01 | -6.289785e-02 | -1.592246e-01 | 3.312687e-02 | 2.377348e-01 | -2.337204e-01 |
| 75% | 8.361286e-01 | 6.697929e-01 | 6.981085e-01 | 6.020883e-01 | 5.096384e-01 | 0.808997 | 8.490851e-01 | 6.095413e-01 | 6.291754e-01 | 4.939560e-01 | 7.131644e-01 | 7.885875e-01 | 7.582494e-01 |
| max | 2.259772e+00 | 3.109192e+00 | 3.156325e+00 | 3.154511e+00 | 4.371372e+00 | 2.539515 | 3.062832e+00 | 2.402403e+00 | 3.485073e+00 | 3.435432e+00 | 3.301694e+00 | 1.960915e+00 | 2.971473e+00 |
A.3.3 Split data set to training data and testing data¶
In [72]:
# for test 30% , train 70%
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size=0.3)
A.3.4 Build the linear regressing model¶
In [73]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(xtrain, ytrain)
Out[73]:
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
A.3.5 Compare between original data and the result of prediction from the model¶
In [74]:
ypred = logmodel.predict(xtest)
A.3.6 Evaluate the model with confusion metrix¶
In [75]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(ytest, ypred))
[[16 0 0] [ 0 19 0] [ 0 0 19]]
In [76]:
print(classification_report(ytest,ypred))
precision recall f1-score support
Tpye3 1.00 1.00 1.00 16
Type1 1.00 1.00 1.00 19
Type2 1.00 1.00 1.00 19
accuracy 1.00 54
macro avg 1.00 1.00 1.00 54
weighted avg 1.00 1.00 1.00 54
In [77]:
import pandas as pd
df=pd.read_csv('kk.csv')
df.head()
Out[77]:
| Kyphosis | Age | Number | Start | |
|---|---|---|---|---|
| 0 | absent | 71 | 3 | 5 |
| 1 | absent | 158 | 3 | 14 |
| 2 | present | 128 | 4 | 5 |
| 3 | absent | 2 | 5 | 1 |
| 4 | absent | 1 | 4 | 15 |
In [78]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 81 entries, 0 to 80 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Kyphosis 81 non-null object 1 Age 81 non-null int64 2 Number 81 non-null int64 3 Start 81 non-null int64 dtypes: int64(3), object(1) memory usage: 2.7+ KB
In [79]:
df.describe()
Out[79]:
| Age | Number | Start | |
|---|---|---|---|
| count | 81.000000 | 81.000000 | 81.000000 |
| mean | 83.654321 | 4.049383 | 11.493827 |
| std | 58.104251 | 1.619423 | 4.883962 |
| min | 1.000000 | 2.000000 | 1.000000 |
| 25% | 26.000000 | 3.000000 | 9.000000 |
| 50% | 87.000000 | 4.000000 | 13.000000 |
| 75% | 130.000000 | 5.000000 | 16.000000 |
| max | 206.000000 | 10.000000 | 18.000000 |
In [80]:
# plot for see relation ,Can't see how they are related.
import seaborn as sns
sns.pairplot(df)
Out[80]:
<seaborn.axisgrid.PairGrid at 0x257cb5c93d0>
In [81]:
sns.pairplot(df,hue='Kyphosis')
Out[81]:
<seaborn.axisgrid.PairGrid at 0x257cbafa3d0>
B.2 Split data set to training data and testing data¶
In [82]:
from sklearn.model_selection import train_test_split
# create data to two part (train and test)
x=df.drop('Kyphosis',axis=1)
y=df['Kyphosis']
xtrain, xtest, ytrain, ytest =train_test_split(x,y, test_size =0.3)
B.3 Build the tree¶
In [83]:
from sklearn.tree import DecisionTreeClassifier
In [84]:
from sklearn.tree import DecisionTreeClassifier
# let deep of tree if don't set is a lot deep
dtree =DecisionTreeClassifier(max_depth=2)
In [85]:
dtree.fit(xtrain, ytrain)
Out[85]:
DecisionTreeClassifier(max_depth=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(max_depth=2)
In [86]:
from sklearn import tree
tree.plot_tree(dtree)
Out[86]:
[Text(0.5, 0.8333333333333334, 'x[2] <= 8.5\ngini = 0.337\nsamples = 56\nvalue = [44, 12]'), Text(0.25, 0.5, 'x[0] <= 72.0\ngini = 0.444\nsamples = 12\nvalue = [4, 8]'), Text(0.375, 0.6666666666666667, 'True '), Text(0.125, 0.16666666666666666, 'gini = 0.444\nsamples = 6\nvalue = [4, 2]'), Text(0.375, 0.16666666666666666, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]'), Text(0.75, 0.5, 'x[2] <= 13.5\ngini = 0.165\nsamples = 44\nvalue = [40, 4]'), Text(0.625, 0.6666666666666667, ' False'), Text(0.625, 0.16666666666666666, 'gini = 0.308\nsamples = 21\nvalue = [17, 4]'), Text(0.875, 0.16666666666666666, 'gini = 0.0\nsamples = 23\nvalue = [23, 0]')]
In [87]:
from sklearn.tree import export_text
r=export_text(dtree,feature_names=['age','num','start'])
In [88]:
print(r)
|--- start <= 8.50 | |--- age <= 72.00 | | |--- class: absent | |--- age > 72.00 | | |--- class: present |--- start > 8.50 | |--- start <= 13.50 | | |--- class: absent | |--- start > 13.50 | | |--- class: absent
B.4 Evaluate the model with confusion metrix¶
In [89]:
pred=dtree.predict(xtest)
In [90]:
# compare
ytest==pred
Out[90]:
57 True 80 True 3 True 69 True 10 False 14 True 17 True 37 False 64 True 13 True 21 True 20 True 72 True 38 True 25 True 42 False 41 True 62 False 1 True 27 True 7 True 39 False 16 True 23 False 18 True Name: Kyphosis, dtype: bool
In [91]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(ytest,pred))
[[17 3] [ 3 2]]
In [92]:
print(classification_report(ytest,pred))
precision recall f1-score support
absent 0.85 0.85 0.85 20
present 0.40 0.40 0.40 5
accuracy 0.76 25
macro avg 0.62 0.62 0.62 25
weighted avg 0.76 0.76 0.76 25
In [93]:
import pandas as pd
df = pd.read_csv("Classified Data",index_col=0) # let column index
df.head()
Out[93]:
| WTT | PTI | EQW | SBI | LQE | QWG | FDJ | PJF | HQE | NXJ | TARGET CLASS | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.913917 | 1.162073 | 0.567946 | 0.755464 | 0.780862 | 0.352608 | 0.759697 | 0.643798 | 0.879422 | 1.231409 | 1 |
| 1 | 0.635632 | 1.003722 | 0.535342 | 0.825645 | 0.924109 | 0.648450 | 0.675334 | 1.013546 | 0.621552 | 1.492702 | 0 |
| 2 | 0.721360 | 1.201493 | 0.921990 | 0.855595 | 1.526629 | 0.720781 | 1.626351 | 1.154483 | 0.957877 | 1.285597 | 0 |
| 3 | 1.234204 | 1.386726 | 0.653046 | 0.825624 | 1.142504 | 0.875128 | 1.409708 | 1.380003 | 1.522692 | 1.153093 | 1 |
| 4 | 1.279491 | 0.949750 | 0.627280 | 0.668976 | 1.232537 | 0.703727 | 1.115596 | 0.646691 | 1.463812 | 1.419167 | 1 |
C.2 Show basic information¶
In [94]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 1000 entries, 0 to 999 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 WTT 1000 non-null float64 1 PTI 1000 non-null float64 2 EQW 1000 non-null float64 3 SBI 1000 non-null float64 4 LQE 1000 non-null float64 5 QWG 1000 non-null float64 6 FDJ 1000 non-null float64 7 PJF 1000 non-null float64 8 HQE 1000 non-null float64 9 NXJ 1000 non-null float64 10 TARGET CLASS 1000 non-null int64 dtypes: float64(10), int64(1) memory usage: 93.8 KB
In [95]:
df.describe()
Out[95]:
| WTT | PTI | EQW | SBI | LQE | QWG | FDJ | PJF | HQE | NXJ | TARGET CLASS | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.00000 |
| mean | 0.949682 | 1.114303 | 0.834127 | 0.682099 | 1.032336 | 0.943534 | 0.963422 | 1.071960 | 1.158251 | 1.362725 | 0.50000 |
| std | 0.289635 | 0.257085 | 0.291554 | 0.229645 | 0.243413 | 0.256121 | 0.255118 | 0.288982 | 0.293738 | 0.204225 | 0.50025 |
| min | 0.174412 | 0.441398 | 0.170924 | 0.045027 | 0.315307 | 0.262389 | 0.295228 | 0.299476 | 0.365157 | 0.639693 | 0.00000 |
| 25% | 0.742358 | 0.942071 | 0.615451 | 0.515010 | 0.870855 | 0.761064 | 0.784407 | 0.866306 | 0.934340 | 1.222623 | 0.00000 |
| 50% | 0.940475 | 1.118486 | 0.813264 | 0.676835 | 1.035824 | 0.941502 | 0.945333 | 1.065500 | 1.165556 | 1.375368 | 0.50000 |
| 75% | 1.163295 | 1.307904 | 1.028340 | 0.834317 | 1.198270 | 1.123060 | 1.134852 | 1.283156 | 1.383173 | 1.504832 | 1.00000 |
| max | 1.721779 | 1.833757 | 1.722725 | 1.634884 | 1.650050 | 1.666902 | 1.713342 | 1.785420 | 1.885690 | 1.893950 | 1.00000 |
C.3 Visualize data¶
In [96]:
import seaborn as sns
sns.pairplot(df[['FDJ','PJF','HQE','NXJ','TARGET CLASS']] , hue='TARGET CLASS') # Separate taket colors
Out[96]:
<seaborn.axisgrid.PairGrid at 0x257cc0ab450>
C.4 Prepare data¶
In [97]:
x = df.drop('TARGET CLASS',axis=1) # x keep independent variable
y = df.loc[:, 'TARGET CLASS']# y keep dependent variable
In [98]:
x
Out[98]:
| WTT | PTI | EQW | SBI | LQE | QWG | FDJ | PJF | HQE | NXJ | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.913917 | 1.162073 | 0.567946 | 0.755464 | 0.780862 | 0.352608 | 0.759697 | 0.643798 | 0.879422 | 1.231409 |
| 1 | 0.635632 | 1.003722 | 0.535342 | 0.825645 | 0.924109 | 0.648450 | 0.675334 | 1.013546 | 0.621552 | 1.492702 |
| 2 | 0.721360 | 1.201493 | 0.921990 | 0.855595 | 1.526629 | 0.720781 | 1.626351 | 1.154483 | 0.957877 | 1.285597 |
| 3 | 1.234204 | 1.386726 | 0.653046 | 0.825624 | 1.142504 | 0.875128 | 1.409708 | 1.380003 | 1.522692 | 1.153093 |
| 4 | 1.279491 | 0.949750 | 0.627280 | 0.668976 | 1.232537 | 0.703727 | 1.115596 | 0.646691 | 1.463812 | 1.419167 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | 1.010953 | 1.034006 | 0.853116 | 0.622460 | 1.036610 | 0.586240 | 0.746811 | 0.319752 | 1.117340 | 1.348517 |
| 996 | 0.575529 | 0.955786 | 0.941835 | 0.792882 | 1.414277 | 1.269540 | 1.055928 | 0.713193 | 0.958684 | 1.663489 |
| 997 | 1.135470 | 0.982462 | 0.781905 | 0.916738 | 0.901031 | 0.884738 | 0.386802 | 0.389584 | 0.919191 | 1.385504 |
| 998 | 1.084894 | 0.861769 | 0.407158 | 0.665696 | 1.608612 | 0.943859 | 0.855806 | 1.061338 | 1.277456 | 1.188063 |
| 999 | 0.837460 | 0.961184 | 0.417006 | 0.799784 | 0.934399 | 0.424762 | 0.778234 | 0.907962 | 1.257190 | 1.364837 |
1000 rows × 10 columns
In [99]:
y
Out[99]:
0 1
1 0
2 0
3 1
4 1
..
995 1
996 0
997 1
998 1
999 1
Name: TARGET CLASS, Length: 1000, dtype: int64
In [100]:
# mean is near, but std is swing
x.describe()
Out[100]:
| WTT | PTI | EQW | SBI | LQE | QWG | FDJ | PJF | HQE | NXJ | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
| mean | 0.949682 | 1.114303 | 0.834127 | 0.682099 | 1.032336 | 0.943534 | 0.963422 | 1.071960 | 1.158251 | 1.362725 |
| std | 0.289635 | 0.257085 | 0.291554 | 0.229645 | 0.243413 | 0.256121 | 0.255118 | 0.288982 | 0.293738 | 0.204225 |
| min | 0.174412 | 0.441398 | 0.170924 | 0.045027 | 0.315307 | 0.262389 | 0.295228 | 0.299476 | 0.365157 | 0.639693 |
| 25% | 0.742358 | 0.942071 | 0.615451 | 0.515010 | 0.870855 | 0.761064 | 0.784407 | 0.866306 | 0.934340 | 1.222623 |
| 50% | 0.940475 | 1.118486 | 0.813264 | 0.676835 | 1.035824 | 0.941502 | 0.945333 | 1.065500 | 1.165556 | 1.375368 |
| 75% | 1.163295 | 1.307904 | 1.028340 | 0.834317 | 1.198270 | 1.123060 | 1.134852 | 1.283156 | 1.383173 | 1.504832 |
| max | 1.721779 | 1.833757 | 1.722725 | 1.634884 | 1.650050 | 1.666902 | 1.713342 | 1.785420 | 1.885690 | 1.893950 |
In [101]:
# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)
x_scaled
Out[101]:
array([[-0.12354188, 0.18590747, -0.91343069, ..., -1.48236813,
-0.9497194 , -0.64331425],
[-1.08483602, -0.43034845, -1.02531333, ..., -0.20224031,
-1.82805088, 0.63675862],
[-0.78870217, 0.33931821, 0.30151137, ..., 0.28570652,
-0.68249379, -0.37784986],
...,
[ 0.64177714, -0.51308341, -0.17920486, ..., -2.36249443,
-0.81426092, 0.11159651],
[ 0.46707241, -0.98278576, -1.46519359, ..., -0.03677699,
0.40602453, -0.85567 ],
[-0.38765353, -0.59589427, -1.4313981 , ..., -0.56778932,
0.3369971 , 0.01034996]])
In [102]:
# mean is near 0 it' ok and std near 1
pd.DataFrame(x_scaled).describe()
Out[102]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.000000e+03 | 1.000000e+03 | 1.000000e+03 | 1.000000e+03 | 1.000000e+03 | 1.000000e+03 | 1.000000e+03 | 1.000000e+03 | 1.000000e+03 | 1.000000e+03 |
| mean | 1.119105e-16 | -2.939871e-16 | -1.203482e-16 | -1.882938e-16 | -6.057377e-16 | 3.552714e-17 | 2.255973e-16 | -4.760636e-16 | 3.197442e-16 | 4.503065e-16 |
| std | 1.000500e+00 | 1.000500e+00 | 1.000500e+00 | 1.000500e+00 | 1.000500e+00 | 1.000500e+00 | 1.000500e+00 | 1.000500e+00 | 1.000500e+00 | 1.000500e+00 |
| min | -2.678050e+00 | -2.618747e+00 | -2.275858e+00 | -2.775551e+00 | -2.947206e+00 | -2.660802e+00 | -2.620466e+00 | -2.674465e+00 | -2.701361e+00 | -3.542140e+00 |
| 25% | -7.161683e-01 | -6.702761e-01 | -7.504105e-01 | -7.279635e-01 | -6.637361e-01 | -7.127975e-01 | -7.020467e-01 | -7.120098e-01 | -7.626629e-01 | -6.863610e-01 |
| 50% | -3.180217e-02 | 1.628137e-02 | -7.159299e-02 | -2.293699e-02 | 1.433731e-02 | -7.940354e-03 | -7.093937e-02 | -2.236584e-02 | 2.488297e-02 | 6.194010e-02 |
| 75% | 7.378939e-01 | 7.534412e-01 | 6.664646e-01 | 6.631695e-01 | 6.820374e-01 | 7.012930e-01 | 6.723000e-01 | 7.311915e-01 | 7.661087e-01 | 6.961851e-01 |
| max | 2.667092e+00 | 2.799904e+00 | 3.049325e+00 | 4.151021e+00 | 2.538987e+00 | 2.825739e+00 | 2.940974e+00 | 2.470109e+00 | 2.477734e+00 | 2.602476e+00 |
C.5 Split data¶
In [103]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_scaled,y, test_size=0.30)
C.6 Train model¶
In [104]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1) # สร้าง object ใส่ n_neighbors=1 จุด คือดูว่าเพื่อนบ้านใกล้ๆเป็นใคร เเล้วสรุปเลย ดูจากจุดใกล้สุดเเค่จุดเดียว
knn.fit(x_train,y_train)
Out[104]:
KNeighborsClassifier(n_neighbors=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=1)
C.7 Predict¶
In [105]:
y_pred = knn.predict(x_test)
y_pred
Out[105]:
array([0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1])
In [106]:
y_test
Out[106]:
448 0
69 0
901 0
638 0
139 0
..
117 1
805 0
877 0
501 0
111 1
Name: TARGET CLASS, Length: 300, dtype: int64
In [107]:
y_test == y_pred
Out[107]:
448 True
69 False
901 True
638 True
139 True
...
117 True
805 True
877 True
501 True
111 True
Name: TARGET CLASS, Length: 300, dtype: bool
In [108]:
# check % of prediction
import numpy as np
np.mean(y_test == y_pred)
Out[108]:
np.float64(0.9266666666666666)
In [109]:
# calculate error
import numpy as np
np.mean(y_test != y_pred)
Out[109]:
np.float64(0.07333333333333333)
In [110]:
# evaluate
from sklearn.metrics import classification_report,confusion_matrix
print( confusion_matrix( y_test , y_pred ) ) # use print for read easy
print( classification_report( y_test , y_pred ) )
[[143 15]
[ 7 135]]
precision recall f1-score support
0 0.95 0.91 0.93 158
1 0.90 0.95 0.92 142
accuracy 0.93 300
macro avg 0.93 0.93 0.93 300
weighted avg 0.93 0.93 0.93 300
In [111]:
# class 0 correct 130 incorrect 15
# class 1 correct 142 incorrect 13
C.8 Change the number of N¶
In [112]:
# Train model again (use N = 23)
knn = KNeighborsClassifier(n_neighbors=23)
knn.fit(x_train,y_train)
Out[112]:
KNeighborsClassifier(n_neighbors=23)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=23)
In [113]:
# Predict again
y_pred_new = knn.predict(x_test)
In [114]:
# Write performance of both N=1 and N=23 to compare
print('WITH K=1')
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(np.mean(y_test != y_pred), '\n')
print('***************************************************')
print('WITH K=23')
print(confusion_matrix(y_test,y_pred_new),)
print(classification_report(y_test,y_pred_new))
print(np.mean(y_test != y_pred_new))
WITH K=1
[[143 15]
[ 7 135]]
precision recall f1-score support
0 0.95 0.91 0.93 158
1 0.90 0.95 0.92 142
accuracy 0.93 300
macro avg 0.93 0.93 0.93 300
weighted avg 0.93 0.93 0.93 300
0.07333333333333333
***************************************************
WITH K=23
[[144 14]
[ 4 138]]
precision recall f1-score support
0 0.97 0.91 0.94 158
1 0.91 0.97 0.94 142
accuracy 0.94 300
macro avg 0.94 0.94 0.94 300
weighted avg 0.94 0.94 0.94 300
0.06
C.9 Iteratively run the model for several N¶
In [115]:
error_rate = []
for i in range(1,70):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(x_train,y_train)
pred_i = knn.predict(x_test)
error_rate.append(np.mean(pred_i != y_test))
error_rate
Out[115]:
[np.float64(0.07333333333333333), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.05333333333333334), np.float64(0.06), np.float64(0.06333333333333334), np.float64(0.06666666666666667), np.float64(0.07), np.float64(0.07), np.float64(0.06333333333333334), np.float64(0.06333333333333334), np.float64(0.06333333333333334), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.07333333333333333), np.float64(0.07), np.float64(0.07), np.float64(0.07), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06), np.float64(0.06333333333333334), np.float64(0.06333333333333334), np.float64(0.06), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06333333333333334), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06333333333333334), np.float64(0.06333333333333334), np.float64(0.06333333333333334), np.float64(0.06333333333333334), np.float64(0.06666666666666667), np.float64(0.06333333333333334), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.07), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06333333333333334), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06333333333333334), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.06666666666666667), np.float64(0.07), np.float64(0.07), np.float64(0.07), np.float64(0.06666666666666667), np.float64(0.07), np.float64(0.07), np.float64(0.07333333333333333), np.float64(0.07333333333333333)]
In [116]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,6))
plt.plot(range(1,70),error_rate,color='blue', linestyle='dashed', marker='o',
markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
Out[116]:
Text(0, 0.5, 'Error Rate')
In [117]:
# จำนวนข้อมูล n_samples=200 อยากให้มี 200 ตัว
#centers=4 แบ่งเป็น 4 คลาส
#cluster_std=1.8 ในเเต่ละกลุ่มมีการกระจายเเค่ไหน
# n_features=2 คอลัมน์ กี่อัน
#random_state=11 ถ้าใส่ 11 มันก็จะออกมาเเบบเดียวกัน ถ้าเราไม่ได้ใส่มันจะไม่เหมือนกัน
from sklearn.datasets import make_blobs
data = make_blobs(n_samples=200, n_features=2,centers=4, cluster_std=1.8, random_state=11)
In [118]:
data #เป็น array เราก็ต้องปรับให้เป็น data frame / class อยู่ตอนท้ายเลย
Out[118]:
(array([[ -6.7049618 , -9.4451861 ],
[ -9.52374112, -2.05466392],
[ -4.42393254, 6.05937796],
[ -0.60779387, 6.157514 ],
[-10.35327546, 2.72511654],
[ -7.98983763, -10.46681546],
[-11.10305315, -2.06045912],
[ -6.72900181, -10.29546072],
[ -7.33616396, -10.06279363],
[ -7.91733607, 1.23668211],
[ -1.59511008, 4.22136064],
[ -4.473001 , -11.52143718],
[-11.26179938, -2.73207462],
[ 1.15451327, -0.27005967],
[ -1.43974111, 7.26027881],
[ -3.03807192, 0.33467293],
[ -7.37052326, -0.72935953],
[ -5.1429573 , -0.39339669],
[ -3.29990809, -1.73975716],
[ -9.42549178, 0.65205694],
[ -3.01561568, -3.08884181],
[ -4.21822327, -11.12801415],
[-11.48516037, -0.84109636],
[ -5.46565496, -8.79078802],
[ -0.36030956, 1.97639025],
[ -0.85201771, -4.01356182],
[ -7.25660601, -2.74711271],
[ -4.81220138, -6.2870384 ],
[ -7.6790141 , -13.04541635],
[ -4.38783971, -8.82789659],
[ -4.43672667, -8.64961608],
[ -7.66234357, -0.15840098],
[ -1.27761095, 1.99260789],
[ -0.41327826, 0.67732941],
[ 0.80544421, 4.41803387],
[ -2.20649226, 0.34271998],
[ -6.36136405, 2.57940645],
[-11.31394338, 0.18887276],
[ -3.18652638, -0.14987313],
[ 0.95418016, 3.28364209],
[ -3.32892431, 7.15640167],
[ 2.25093884, 5.18023182],
[ -8.33895511, -10.38819888],
[ -8.8225386 , 0.60762628],
[ -0.82329405, 7.72343456],
[ -2.40376881, -3.30636624],
[ 0.44161532, 7.73774272],
[ -0.12155328, 4.74757038],
[ -1.82707151, -3.52446161],
[ -9.74980005, -0.67007937],
[-10.48659376, 0.84801083],
[ -7.36053907, -9.04277037],
[ -6.82165702, -11.82871834],
[ -1.21144566, 7.27465518],
[ -1.49960135, 4.25944077],
[-14.1224411 , 0.71038334],
[ 0.42301164, 0.53042185],
[-11.37281992, 0.14935412],
[ -1.93761099, -1.47241167],
[ -6.26543863, -9.12160384],
[ -2.78727964, 4.75392739],
[ -0.96526809, 2.53696837],
[ -1.36259389, -0.18860758],
[ -7.52746018, 1.5940934 ],
[ -6.8883157 , -3.1937581 ],
[ -0.85743289, 3.75490103],
[ -1.57184838, 4.77559282],
[ -3.2833845 , -1.20059516],
[ -7.41599229, 0.32095221],
[ 2.14086035, 7.56537787],
[ -9.13659712, 1.05758591],
[ -3.07099018, 3.21174618],
[ 3.22771189, 4.43195816],
[ -3.25912013, -2.62698214],
[ -0.44747116, -2.27505557],
[-10.23520771, -3.15573248],
[ -3.2048221 , 4.44770558],
[ -8.74459354, -11.62555064],
[ -1.90314451, 2.63835164],
[-10.5828186 , 0.82855665],
[ -2.00537472, 4.41365995],
[ -1.89553834, 5.75713209],
[ -3.5275725 , 6.19614335],
[ -1.47624657, 4.41718333],
[ -2.51259226, -11.10348222],
[ -5.32310501, -10.83953808],
[ -1.62974433, 5.40771803],
[ -2.76763939, 4.52927297],
[ -8.37421203, -0.78999494],
[ -9.17895884, -7.27838172],
[ -1.93290099, -0.21872619],
[ 0.49220401, 4.7699782 ],
[ -5.63671493, -11.52858054],
[ -5.06829892, -6.77615384],
[-10.04706527, 2.0803237 ],
[ -1.94623211, -0.96129572],
[ -2.12586003, 1.04000323],
[ 0.49788667, 0.51256397],
[ -4.82091524, -9.48152139],
[ 0.19043387, 4.96256807],
[ -3.49348601, 4.89264202],
[ -1.28829814, -1.60411602],
[ -1.24926748, -1.18467979],
[ -7.25179229, -8.1159364 ],
[-11.81413719, 0.04928025],
[ -3.77273456, 1.44346278],
[ -0.35504329, -0.88044348],
[ -5.86118143, -7.08550716],
[ -6.00337945, -9.39842006],
[ -8.74324075, 0.0320477 ],
[ -1.93113209, 2.3102078 ],
[ 0.19416482, 4.33775203],
[ -9.42600583, -11.74485465],
[ -0.23989749, 4.8340642 ],
[-10.13660985, -0.80235816],
[ -5.32597106, -8.94358377],
[ -1.41392319, 6.15464716],
[ -3.36286128, -1.4914097 ],
[-10.04078525, -0.23054368],
[-11.56627611, 2.68715861],
[ -0.43351252, 1.81149751],
[ -0.97242574, 4.84865115],
[ -6.45054138, -10.8406991 ],
[ -9.34489915, -10.77563991],
[ -6.65001467, -8.91685772],
[ -3.24112917, 1.02146321],
[ 1.95081923, 2.49125159],
[ -4.04740952, -2.3329917 ],
[-10.16434516, -3.58002119],
[ -1.41679007, 3.6185228 ],
[ 2.0482615 , 7.39645176],
[ -3.97292569, -7.78179931],
[ -1.76807254, 0.68603771],
[ -1.6911743 , 2.93292709],
[ -4.07031171, -11.34606391],
[ -7.70284353, 0.817251 ],
[-10.38538526, -1.85455903],
[ -6.76248179, -10.48618635],
[ -1.0940949 , -0.74879869],
[ -1.03117678, 5.14733856],
[ -5.31442261, -8.35847411],
[-12.39794854, 3.50291204],
[ -8.30558415, -10.84256757],
[ -5.157444 , -12.89191242],
[ -0.92489813, 1.64754594],
[ -0.40143561, 7.79767635],
[ -3.59676528, 3.81963485],
[ -7.36416854, -2.70442975],
[-14.25895161, -2.38571695],
[ -5.26046254, -9.47820734],
[ -9.68627204, -1.40976546],
[ 0.25093776, -0.45525409],
[ -6.63544072, -1.55809157],
[ 0.64345965, 2.70942874],
[ -6.71517394, -10.64361338],
[ -9.21398572, -2.13605268],
[ 1.35688045, 2.01673291],
[ -9.23409598, -7.25079662],
[ -7.06425772, -3.64097575],
[ 0.77059209, 1.82399118],
[ -5.6832246 , -9.38933362],
[ 2.7436003 , 0.91024983],
[ -2.81510507, 5.93268791],
[ -3.67529519, -2.02495533],
[ -4.922621 , -9.54426461],
[ -4.84900531, -11.88582779],
[ 1.60245517, -1.08402234],
[ 1.30563883, -1.70019748],
[-12.13831926, 0.54313595],
[-11.82050066, -1.11880334],
[-10.07905389, -2.71684705],
[ -2.09230769, 2.47179007],
[ -7.94864075, -9.84546981],
[ -1.50803215, 3.059446 ],
[ -7.34747587, -6.79942852],
[ -9.89824031, 0.81572832],
[-11.28326142, -1.13483705],
[ -7.7472025 , -0.96318896],
[-11.90488185, 0.8268213 ],
[ -5.15317806, -8.60034925],
[ -1.48640329, -0.63904628],
[ 0.28398684, 0.0238434 ],
[ -0.51948043, 3.24052955],
[ -1.2071594 , 5.54082658],
[ -8.0694534 , 0.30693121],
[ -5.08825222, -6.82216623],
[ -5.07651779, -10.76713088],
[ -1.14314494, -1.70812849],
[ -8.71038967, 1.87543756],
[ 0.18242097, 5.10836367],
[ -0.83563448, -0.23445361],
[ -2.83812809, 0.26969325],
[ -5.84827238, 0.32051354],
[ -4.42247277, -10.16773312],
[ -6.23444648, -9.49588618],
[ -7.82176324, -9.73964411],
[-10.30506257, 2.62070922],
[ -2.84544553, 2.61593681],
[ -7.00200627, 0.26556989],
[ -4.2952364 , 1.26916468]]),
array([0, 3, 1, 1, 3, 0, 3, 0, 0, 3, 1, 0, 3, 2, 1, 2, 3, 2, 2, 3, 2, 0,
3, 0, 2, 2, 3, 0, 0, 0, 0, 3, 2, 2, 1, 2, 2, 3, 2, 1, 1, 1, 0, 3,
1, 2, 1, 1, 2, 3, 3, 0, 0, 1, 1, 3, 2, 3, 2, 0, 1, 2, 2, 3, 3, 1,
1, 2, 3, 1, 3, 1, 1, 2, 2, 3, 1, 0, 2, 3, 1, 1, 1, 1, 0, 0, 1, 1,
3, 0, 2, 1, 0, 0, 3, 2, 2, 2, 0, 1, 1, 2, 2, 0, 3, 1, 2, 0, 0, 3,
2, 1, 0, 1, 3, 0, 1, 2, 3, 3, 2, 1, 0, 0, 0, 2, 1, 2, 3, 1, 1, 0,
2, 1, 0, 3, 3, 0, 2, 1, 0, 3, 0, 0, 1, 1, 1, 3, 3, 0, 3, 2, 3, 1,
0, 3, 1, 0, 3, 2, 0, 1, 1, 2, 0, 0, 2, 2, 3, 3, 3, 2, 0, 2, 0, 3,
3, 3, 3, 0, 2, 2, 1, 1, 3, 0, 0, 2, 3, 1, 2, 2, 2, 0, 0, 0, 3, 1,
3, 2]))
In [119]:
# จัดให้อยู่ในรูป data frame
# data จะมี 2 ส่วน มี independent and dependent
import pandas as pd
df = pd.DataFrame(data[0])
df.head() #independent
Out[119]:
| 0 | 1 | |
|---|---|---|
| 0 | -6.704962 | -9.445186 |
| 1 | -9.523741 | -2.054664 |
| 2 | -4.423933 | 6.059378 |
| 3 | -0.607794 | 6.157514 |
| 4 | -10.353275 | 2.725117 |
In [120]:
# rename
df = df.rename(columns={0:"x1",1:"x2"})
df.head()
Out[120]:
| x1 | x2 | |
|---|---|---|
| 0 | -6.704962 | -9.445186 |
| 1 | -9.523741 | -2.054664 |
| 2 | -4.423933 | 6.059378 |
| 3 | -0.607794 | 6.157514 |
| 4 | -10.353275 | 2.725117 |
In [121]:
#Add class
df["class"] = data[1]
df.head()
Out[121]:
| x1 | x2 | class | |
|---|---|---|---|
| 0 | -6.704962 | -9.445186 | 0 |
| 1 | -9.523741 | -2.054664 | 3 |
| 2 | -4.423933 | 6.059378 | 1 |
| 3 | -0.607794 | 6.157514 | 1 |
| 4 | -10.353275 | 2.725117 | 3 |
In [122]:
# separate x= independent and y = dependent
x=df.drop("class",axis=1)
y= df.iloc[:,2:3]
In [123]:
x
Out[123]:
| x1 | x2 | |
|---|---|---|
| 0 | -6.704962 | -9.445186 |
| 1 | -9.523741 | -2.054664 |
| 2 | -4.423933 | 6.059378 |
| 3 | -0.607794 | 6.157514 |
| 4 | -10.353275 | 2.725117 |
| ... | ... | ... |
| 195 | -7.821763 | -9.739644 |
| 196 | -10.305063 | 2.620709 |
| 197 | -2.845446 | 2.615937 |
| 198 | -7.002006 | 0.265570 |
| 199 | -4.295236 | 1.269165 |
200 rows × 2 columns
In [124]:
y
Out[124]:
| class | |
|---|---|
| 0 | 0 |
| 1 | 3 |
| 2 | 1 |
| 3 | 1 |
| 4 | 3 |
| ... | ... |
| 195 | 0 |
| 196 | 3 |
| 197 | 1 |
| 198 | 3 |
| 199 | 2 |
200 rows × 1 columns
D.2 Visualize Data¶
In [125]:
import matplotlib.pyplot as plt
plt.scatter(x["x1"], x["x2"])
Out[125]:
<matplotlib.collections.PathCollection at 0x257ccfd6750>
In [126]:
import matplotlib.pyplot as plt
plt.scatter(x["x1"], x["x2"],c=y["class"],cmap="rainbow")
Out[126]:
<matplotlib.collections.PathCollection at 0x257ccfe9090>
D.3 Create K-Mean model¶
In [127]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4) # คลัสเตอร์เราสุ่มมาได้ เเต่เราเเบ่งมาเเล้ว 4 ทีเเรก ถ้าไม่มีีเราอาจจะต้องเดาว่ามันควรมีกี่
kmeans.fit(x) # is unsupervice คือมีเเค่ x1 x2 ไม่ต้อง train test split เพราะมันมีข้อมูลมาน้อย
Out[127]:
KMeans(n_clusters=4)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=4)
In [128]:
kmeans.cluster_centers_ # ที่เเบ่งเป็น 4 คลัสเจอร์ จุดศูนย์กลางตรงไหน
Out[128]:
array([[-9.57264985, -0.2908924 ],
[-1.01943537, 4.75629575],
[-6.19917361, -9.70986633],
[-1.5238553 , -0.50445283]])
In [129]:
kmeans.labels_ # จะได้ว่า ตัวที่ 0 1 2 ... จัดอยู่ในคลัสเตอร์ไหน
Out[129]:
array([2, 0, 1, 1, 0, 2, 0, 2, 2, 0, 1, 2, 0, 3, 1, 3, 0, 3, 3, 0, 3, 2,
0, 2, 3, 3, 0, 2, 2, 2, 2, 0, 3, 3, 1, 3, 0, 0, 3, 1, 1, 1, 2, 0,
1, 3, 1, 1, 3, 0, 0, 2, 2, 1, 1, 0, 3, 0, 3, 2, 1, 1, 3, 0, 0, 1,
1, 3, 0, 1, 0, 1, 1, 3, 3, 0, 1, 2, 1, 0, 1, 1, 1, 1, 2, 2, 1, 1,
0, 2, 3, 1, 2, 2, 0, 3, 3, 3, 2, 1, 1, 3, 3, 2, 0, 3, 3, 2, 2, 0,
1, 1, 2, 1, 0, 2, 1, 3, 0, 0, 3, 1, 2, 2, 2, 3, 1, 3, 0, 1, 1, 2,
3, 1, 2, 0, 0, 2, 3, 1, 2, 0, 2, 2, 3, 1, 1, 0, 0, 2, 0, 3, 0, 1,
2, 0, 1, 2, 0, 3, 2, 3, 1, 3, 2, 2, 3, 3, 0, 0, 0, 1, 2, 1, 2, 0,
0, 0, 0, 2, 3, 3, 1, 1, 0, 2, 2, 3, 0, 1, 3, 3, 0, 2, 2, 2, 0, 1,
0, 3], dtype=int32)
In [130]:
# เพื่อที่จะดูว่าเเบ่งเเล้วมันดูดีมั้ย แปลง yเป็น array ดู ว่ามันเหมือนด้านบนมั้ย
import pandas as pd
y_array = np.array(y["class"])
y_array
Out[130]:
array([0, 3, 1, 1, 3, 0, 3, 0, 0, 3, 1, 0, 3, 2, 1, 2, 3, 2, 2, 3, 2, 0,
3, 0, 2, 2, 3, 0, 0, 0, 0, 3, 2, 2, 1, 2, 2, 3, 2, 1, 1, 1, 0, 3,
1, 2, 1, 1, 2, 3, 3, 0, 0, 1, 1, 3, 2, 3, 2, 0, 1, 2, 2, 3, 3, 1,
1, 2, 3, 1, 3, 1, 1, 2, 2, 3, 1, 0, 2, 3, 1, 1, 1, 1, 0, 0, 1, 1,
3, 0, 2, 1, 0, 0, 3, 2, 2, 2, 0, 1, 1, 2, 2, 0, 3, 1, 2, 0, 0, 3,
2, 1, 0, 1, 3, 0, 1, 2, 3, 3, 2, 1, 0, 0, 0, 2, 1, 2, 3, 1, 1, 0,
2, 1, 0, 3, 3, 0, 2, 1, 0, 3, 0, 0, 1, 1, 1, 3, 3, 0, 3, 2, 3, 1,
0, 3, 1, 0, 3, 2, 0, 1, 1, 2, 0, 0, 2, 2, 3, 3, 3, 2, 0, 2, 0, 3,
3, 3, 3, 0, 2, 2, 1, 1, 3, 0, 0, 2, 3, 1, 2, 2, 2, 0, 0, 0, 3, 1,
3, 2])
In [131]:
np.mean(y_array == kmeans.labels_)
Out[131]:
np.float64(0.235)
In [132]:
# y_array == kmeans is about 49% ,why
D.4 Compare with original¶
In [133]:
f, (ax1,ax2) = plt.subplots(1,2,sharey=True,figsize=(10,6))
ax1.set_title("Original")
ax1.scatter(x["x1"],x["x2"],c=y["class"],cmap="rainbow")
ax2.set_title("K Means")
ax2.scatter(x["x1"],x["x2"],c=kmeans.labels_,cmap="rainbow")
Out[133]:
<matplotlib.collections.PathCollection at 0x257cd4d9c50>
In [134]:
# การเเบ่งกลุ่มถ้าไม่มองสีก็ค่อนข้างที่จะตรง เเต่สิ่งที่ต่างคือสี เนื่องจากตอนที่เราเเบ่ง class เราไม่มีชื่อ เราไม่ได้ตั้งชื่อกลุ่ม
# ดังนั้นเวลาเราเทียบ มันจัดกลุ่มถูก เเต่ชื่อมันอาจจะผิดมันระบุชื่อไม่ได้
# เป็นเหตผลว่าทำไมมันถึงได้ 49 %
In [135]:
# plot by set color
f, (ax1 , ax2) = plt.subplots(1 , 2, sharey=True,figsize=(10,6))
cdict = {0 : "black", 1 :"red" , 2 : "blue" , 3 : "green"}
ax1.set_title("Original")
for c in np.unique(y["class"]):
ix = np.where(y["class"]==c)
ax1.scatter(np.array(x["x1"])[ix],np.array(x["x2"])[ix],c= cdict[c] , label = c)
ax1.legend()
ax2.set_title("K means")
for c in np.unique(kmeans.labels_):
ix = np.where(kmeans.labels_==c)
ax2.scatter(np.array(x["x1"])[ix],np.array(x["x2"])[ix],c= cdict[c] , label = c)
ax2.legend()
Out[135]:
<matplotlib.legend.Legend at 0x257cd591990>
In [136]:
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=500, n_features=2, centers=4, cluster_std=1, center_box=(-10.0, 10.0), shuffle=True, random_state=1, )
In [137]:
import matplotlib.pyplot as plt # we know x ,dont know y
plt.scatter(X[:,0],X[:,1])
Out[137]:
<matplotlib.collections.PathCollection at 0x257cd861690>
In [138]:
plt.scatter(X[:,0],X[:,1], c=y) # if we see y (but we dont know)
Out[138]:
<matplotlib.collections.PathCollection at 0x257cd8c1cd0>
In [139]:
from sklearn.cluster import KMeans
clusterer = KMeans(n_clusters=4, random_state=10)
cluster_labels = clusterer.fit_predict(X)
plt.scatter(X[:,0],X[:,1], c=cluster_labels)
Out[139]:
<matplotlib.collections.PathCollection at 0x257cd951ed0>
In [140]:
# เเบบนี้ดีมั้ย เราจึงต้องใช้ค่า silhouette ในการวัด -1 - 1 ค่าที่ดีต้องใกล้ ๆ 1
# เป็นการคิดจากจุดแต่ละจุด หรือว่าระยะทางจากจุดแต่ละจุดไปยัง cluster ถ้าห่างมาก็เเปลว่าดี
# ใกล้ 1 เเสดงว่าห่างจากค่าอื่นเยอะ เเล้วก็ใกล้ตัว
from sklearn.metrics import silhouette_samples, silhouette_score
silhouette_avg = silhouette_score(X, cluster_labels)
print( "The average silhouette_score is :", silhouette_avg )
The average silhouette_score is : 0.6505186632729437
In [141]:
import matplotlib.cm as cm
import numpy as np
fig, ax = plt.subplots()
sample_silhouette_values = silhouette_samples(X, cluster_labels)
y_lower = 10
for i in [0,1,2,3]:
ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.nipy_spectral(float(i) / 4)
ax.fill_betweenx(np.arange(y_lower, y_upper),0,ith_cluster_silhouette_values,facecolor=color,edgecolor=color,alpha=0.7)
ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
y_lower = y_upper + 10
ax.axvline(x=silhouette_avg, color="red", linestyle="--")
plt.show()
D.5.2 Iteratively do cluster with different number of clusters¶
In [142]:
import matplotlib.cm as cm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
range_n_clusters = [2, 3, 4, 5, 6] # เเบ่งกี่คลัสเตอร์
for n_clusters in range_n_clusters:
# Setup plot
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)
ax1.set_xlim([-0.1, 1])
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10]) # เซตค่ากราผ
# Do clustring การเเบ่งคลัสเตอร์ด้วยเคมีน
clusterer = KMeans(n_clusters=n_clusters, random_state=10)
cluster_labels = clusterer.fit_predict(X)
# Calculate the silhouette_score
silhouette_avg = silhouette_score(X, cluster_labels)
print( "For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg )
# 1st plot shows silhouette score พล็อตค่า ว่า 23456 หน้าตาเป็นยังไง
sample_silhouette_values = silhouette_samples(X, cluster_labels)
y_lower = 10
for i in range(n_clusters):
ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.nipy_spectral(float(i) / n_clusters)
ax1.fill_betweenx(np.arange(y_lower, y_upper),0,ith_cluster_silhouette_values,facecolor=color,edgecolor=color,alpha=0.7)
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
y_lower = y_upper + 10
ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# Plot average silhouette score as vertical line พล็อตเรดไลน์ ค่าเฉลี่ยน
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([]) # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
# 2nd plot shows the clusters เเสดงปลการเเบ่งคลัสเตอร์
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k")
centers = clusterer.cluster_centers_
ax2.scatter(centers[:, 0],centers[:, 1],marker="o",c="white",alpha=1,s=200,edgecolor="k",)
for i, c in enumerate(centers):
ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")
plt.suptitle("Silhouette analysis for KMeans clustering on sample data with n_clusters = %d" % n_clusters,fontsize=14,fontweight="bold")
plt.show()
For n_clusters = 2 The average silhouette_score is : 0.7049787496083262 For n_clusters = 3 The average silhouette_score is : 0.5882004012129721 For n_clusters = 4 The average silhouette_score is : 0.6505186632729437 For n_clusters = 5 The average silhouette_score is : 0.561464362648773 For n_clusters = 6 The average silhouette_score is : 0.4857596147013469
In [143]:
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
In [144]:
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], X[:, 1],s=10);
In [145]:
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], X[:, 1], c=y ,s=10);
D.6.2 Manually do clustering step-by-step¶
In [146]:
import numpy as np
from sklearn.metrics import pairwise_distances_argmin
n_clusters = 4
rseed = 1 # ถ้าลองเปลี่ยน จะทำให้คลัสเตอร์เปลี่ยน
# 1. Randomly choose clusters
rng = np.random.RandomState(rseed)
i = rng.permutation(X.shape[0])[:n_clusters]
centers = X[i]
In [147]:
plt.scatter(X[:, 0], X[:, 1],s=10);
plt.scatter(centers[:, 0], centers[:, 1],c='red', s=30);
In [148]:
# 2. Assign labels based on closest center
labels = pairwise_distances_argmin(X, centers)
# 3. Find new centers from means of points
new_centers = np.array([X[labels == i].mean(0) for i in range(n_clusters)])
centers = new_centers
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis');
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=100, alpha=0.5);
In [149]:
# หลังจากที่เรา assign จุดแต่ละจุดเข้ากับ cluster พร้อมกับหา centroid ใหม่
# จะได้ดังภาพนะครับ เป็นจุดของ cluster 0 cluster 1, 2, 3 ตามรูป
# เพราะฉะนั้น loop แรกเราก็ run ใช้เทคนิคเดิมซ้ำ
# พอเราได้ตัว centroid ใหม่เราก็มาเช็กใหม่ว่าข้อมูลแต่ละจุดมันห่างจาก cluster ไหนมากกว่ากัน
# ถ้ามันอยู่ใกล้ cluster ไหนเนี่ย ก็จะถูก assign ใหม่ให้ไปอยู่ในนั้น cluster นั้น
# ลอง run ดูอีก 1 รอบ มันจะเริ่มเปลี่ยน จุดมันจะเริ่มย้ายแล้ว
# run รอบที่ 3 รอบที่ 4 รอบที่ 5 สังเกตว่าตอนนี้ มันไม่เปลี่ยนแล้ว
# แสดงว่าตอนนี้มันถึงเรียกว่าเป็น optimal ดีที่สุดเเล้ว
D.6.3 Run clustering with changing random state¶
ลดปัญหายังไง รันหลายๆรอบ เเล้วเปลี่ยนครัสเตอร์ไปหลายๆเเบบ เเล่้วเลือกเอาเเบบที่ดีสุด เพื่อแก้ปัญหา local optimal ได้
In [150]:
# Create clustering function
def find_clusters(X, n_clusters, rseed):
# 1. Randomly choose clusters
rng = np.random.RandomState(rseed)
i = rng.permutation(X.shape[0])[:n_clusters]
centers = X[i]
while True:
# 2. Assign labels based on closest center
labels = pairwise_distances_argmin(X, centers)
# 3. Find new centers from means of points
new_centers = np.array([X[labels == i].mean(0) for i in range(n_clusters)])
# 4. If the centers are not chage, stop
if np.all(centers == new_centers):
break
centers = new_centers
return centers, labels
In [151]:
# iteratively do clustering and plot graph
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_samples, silhouette_score
n_clusters = 4
range_rand_state = [0, 1, 2, 3, 4] # ลองเปลี่ยน random seed เเทน ก่อนหน้าเคยลองเปลี่ยน clusters แล้ว
for rand_state in range_rand_state:
# Setup plot
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)
ax1.set_xlim([-0.1, 1])
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
# Do clustring
centers, cluster_labels = find_clusters(X, n_clusters, rand_state)
# Calculate the silhouette_score
silhouette_avg = silhouette_score(X, cluster_labels)
print( "For random state =", rand_state, "The average silhouette_score is :", silhouette_avg )
sample_silhouette_values = silhouette_samples(X, cluster_labels)
# 1st plot shows silhouette score
y_lower = 10
for i in range(n_clusters):
ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.nipy_spectral(float(i) / n_clusters)
ax1.fill_betweenx(np.arange(y_lower, y_upper),0,ith_cluster_silhouette_values,facecolor=color,edgecolor=color,alpha=0.7)
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
y_lower = y_upper + 10
ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# Plot average silhouette score as vertical line
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([]) # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
# 2nd plot shows the clusters
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k")
# #
ax2.scatter(centers[:, 0],centers[:, 1],marker="o",c="white",alpha=1,s=200,edgecolor="k",)
for i, c in enumerate(centers):
ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")
plt.suptitle("Silhouette analysis for KMeans clustering on sample data with random state = %d" % rand_state,fontsize=14,fontweight="bold")
plt.show()
For random state = 0 The average silhouette_score is : 0.4525003649329323 For random state = 1 The average silhouette_score is : 0.6819938690643478 For random state = 2 The average silhouette_score is : 0.6819938690643478 For random state = 3 The average silhouette_score is : 0.48890511771605344 For random state = 4 The average silhouette_score is : 0.6819938690643478
In [152]:
import numpy as np
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)
X = X[:, ::-1]
rng = np.random.RandomState(13)
X = np.dot(X, rng.randn(2, 2))
In [153]:
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], X[:, 1], c=y, s=40);
Do normal clustering (use k mean)¶
In [154]:
from sklearn.cluster import KMeans
kmeans = KMeans(4, random_state=0)
kmeans.fit(X)
labels = kmeans.predict(X)
In [155]:
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40);
Build function to draw the cluster boundary¶
In [156]:
from scipy.spatial.distance import cdist
def plot_kmeans(kmeans, X, n_clusters=4, rseed=0, ax=None):
labels = kmeans.fit_predict(X)
# plot the input data
ax = ax or plt.gca()
ax.axis('equal')
ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
# plot the representation of the KMeans model
centers = kmeans.cluster_centers_
radii = [cdist(X[labels == i], [center]).max() for i, center in enumerate(centers)]
for c, r in zip(centers, radii):
ax.add_patch(plt.Circle(c, r, fc='#CCCCCC', lw=3, alpha=0.5, zorder=1))
In [157]:
plot_kmeans(kmeans, X)
In [158]:
#ความถูกต้องพอได้ เเต่อยากจะให้ถูกกว่านี้
In [159]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=4,random_state=0) # เเบ่งเแป็นกลุ่ม 4 กลุ่ม
gmm.fit(X)
labels = gmm.predict(X)
In [160]:
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap='viridis'); # original
In [161]:
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis'); # ถูกมากขึ้นถ้าเทียบกับของตัว k means
In [162]:
from matplotlib.patches import Ellipse
def draw_ellipse(position, covariance, ax=None, **kwargs):
ax = ax or plt.gca()
if covariance.shape == (2, 2):
U, s, Vt = np.linalg.svd(covariance)
angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
width, height = 2 * np.sqrt(s)
else:
angle = 0
width, height = 2 * np.sqrt(covariance)
for nsig in range(1, 4):
ax.add_patch(Ellipse(position, nsig * width, nsig * height,angle, **kwargs))
def plot_gmm(gmm, X, label=True, ax=None):
ax = ax or plt.gca()
labels = gmm.fit(X).predict(X)
if label:
ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
else:
ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
ax.axis('equal')
w_factor = 0.2 / gmm.weights_.max()
for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
draw_ellipse(pos, covar, alpha=w * w_factor)
In [163]:
plot_gmm(gmm, X)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[163], line 1 ----> 1 plot_gmm(gmm, X) Cell In[162], line 23, in plot_gmm(gmm, X, label, ax) 21 w_factor = 0.2 / gmm.weights_.max() 22 for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_): ---> 23 draw_ellipse(pos, covar, alpha=w * w_factor) Cell In[162], line 12, in draw_ellipse(position, covariance, ax, **kwargs) 10 width, height = 2 * np.sqrt(covariance) 11 for nsig in range(1, 4): ---> 12 ax.add_patch(Ellipse(position, nsig * width, nsig * height,angle, **kwargs)) TypeError: Ellipse.__init__() takes 4 positional arguments but 5 were given
สรุปคือถ้าพล็อตดูด้วยตาเเล้ว ทรงมันน่าจะเป็นเเบบไหนให้ใช้เเบบนั้น¶
Kmeans วงกลม gmm วงรี หรือลองดูทั้งสองเเล้วเทียบค่า error ก็ได้
Example¶
In [164]:
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
X, y = make_moons(200, noise=.05, random_state=0)
In [165]:
plt.scatter(X[:, 0], X[:, 1], c=y, s=50);
In [166]:
kmeans = KMeans(2, random_state=0)
kmeans.fit(X)
labels = kmeans.predict(X)
In [167]:
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50);
In [168]:
plot_kmeans(kmeans, X)
In [169]:
gmm2 = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
plot_gmm(gmm2, X)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[169], line 2 1 gmm2 = GaussianMixture(n_components=2, covariance_type='full', random_state=0) ----> 2 plot_gmm(gmm2, X) Cell In[162], line 23, in plot_gmm(gmm, X, label, ax) 21 w_factor = 0.2 / gmm.weights_.max() 22 for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_): ---> 23 draw_ellipse(pos, covar, alpha=w * w_factor) Cell In[162], line 12, in draw_ellipse(position, covariance, ax, **kwargs) 10 width, height = 2 * np.sqrt(covariance) 11 for nsig in range(1, 4): ---> 12 ax.add_patch(Ellipse(position, nsig * width, nsig * height,angle, **kwargs)) TypeError: Ellipse.__init__() takes 4 positional arguments but 5 were given
In [ ]:
from sklearn.cluster import SpectralClustering
model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', assign_labels='kmeans')
labels = model.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis');